The Westminster Accounts, a recent collaboration between Sky News and Tortoise Media, examines the flow of money through UK politics. It does so by combining data from three key sources:
You can search and explore the results through the collaboration’s interactive database. Simon Willison has extracted a database and this is what we will be working with. You can also read more about the project’s methodology on the project’s website.
The database made available by Simon Willison is an
SQLite database
# Open a connection to the SQLite database stored in the repo's data/ folder.
# The handle is closed with DBI::dbDisconnect() at the end of the analysis.
sky_westminster <- DBI::dbConnect(
drv = RSQLite::SQLite(),
dbname = here::here("data", "sky-westminster-files.db")
)How many tables does the database have?
DBI::dbListTables(sky_westminster)## [1] "appg_donations" "appgs" "member_appgs" "members"
## [5] "parties" "party_donations" "payments"
# Create lazy dplyr references to the database tables -- no data is pulled
# from SQLite yet; dplyr verbs are translated to SQL and only executed when
# collect() is called.
donations <- dplyr::tbl(sky_westminster, "appg_donations")
members <- dplyr::tbl(sky_westminster, "members")
parties <- dplyr::tbl(sky_westminster, "parties")
payments <- dplyr::tbl(sky_westminster, "payments")
party_donations <- dplyr::tbl(sky_westminster, "party_donations")
glimpse(donations)## Rows: ??
## Columns: 9
## Database: sqlite 3.41.2 [/Users/admin/GiseDropBox Dropbox/Kostis Christodoulou/Teaching/LBS/2022-2023/E628 DSB/dsb2023-github7/data/sky-westminster-files.db]
## $ appg_name <chr> "'Left Behind' Neighbourhoods", "'Left Behind' N…
## $ category_name <chr> "benefits-in-kind", "benefits-in-kind", "benefit…
## $ date <chr> "2022-11-07", "2022-11-07", "2021-12-06", "2021-…
## $ description <chr> "Secretariat From : 17/06/2022 To : 16/06/2023…
## $ entity <chr> "Local Trust", "Local Trust", "Local Trust", "Lo…
## $ id <chr> "3fa80c194b19cbdc9188a956afcd7602", "74cbc9eef62…
## $ latest_register_entry <chr> "https://publications.parliament.uk/pa/cm/cmallp…
## $ paid_for <chr> "", "", "", "", "", "", "", "", "", "", "", "", …
## $ value <int> 121501, 60001, 120001, 1501, 106501, 40501, 3150…
glimpse(members)## Rows: ??
## Columns: 7
## Database: sqlite 3.41.2 [/Users/admin/GiseDropBox Dropbox/Kostis Christodoulou/Teaching/LBS/2022-2023/E628 DSB/dsb2023-github7/data/sky-westminster-files.db]
## $ id <chr> "m8", "m1508", "m1423", "m4514", "m1211", "m3958", "m14",…
## $ name <chr> "Theresa May", "Sir Geoffrey Cox", "Boris Johnson", "Keir…
## $ gender <chr> "F", "M", "M", "M", "M", "F", "M", "M", "F", "M", "F", "M…
## $ constituency <chr> "Maidenhead", "Torridge and West Devon", "Uxbridge and So…
## $ party_id <chr> "p4", "p4", "p4", "p15", "p4", "p4", "p4", "p4", "p4", "p…
## $ short_name <chr> "Mrs May", "Sir Geoffrey", "Mr Johnson", "Mr Starmer", "M…
## $ status <chr> "active", "active", "active", "active", "active", "active…
glimpse(payments)## Rows: ??
## Columns: 13
## Database: sqlite 3.41.2 [/Users/admin/GiseDropBox Dropbox/Kostis Christodoulou/Teaching/LBS/2022-2023/E628 DSB/dsb2023-github7/data/sky-westminster-files.db]
## $ category <chr> "4. Visits outside the UK", "2. (b) Any other sup…
## $ category_name <chr> "Gifts and other benefits", "Cash donations", "Gi…
## $ charity <chr> "", "", "", "", "", "", "", "", "", "", "", "", "…
## $ date <chr> "Registered in November 2021", "Registered in Jan…
## $ date_visited <chr> "Dates of visit: 5-12 November 2021", "", "Dates …
## $ description <chr> "International flights £805.07; accommodation £1,…
## $ destination_of_visit <chr> "Accra, Ghana", "", "Kingston, Jamaica", "", "", …
## $ entity <chr> "GUBA Foundation", "Mahir Kilic", "People's Natio…
## $ hours <chr> "", "", "", "", "", "", "", "", "", "", "", "", "…
## $ id <chr> "44a5c7f837d9df230b8c1e7f72eea188", "b9f40bd69ac2…
## $ member_id <chr> "m172", "m172", "m172", "m172", "m172", "m44", "m…
## $ purpose_of_visit <chr> "To participate in the GUBA Foundation Yaa Asante…
## $ value <dbl> 2631.51, 2000.00, 2574.57, 2000.00, 500.00, 1800.…
glimpse(party_donations)## Rows: ??
## Columns: 6
## Database: sqlite 3.41.2 [/Users/admin/GiseDropBox Dropbox/Kostis Christodoulou/Teaching/LBS/2022-2023/E628 DSB/dsb2023-github7/data/sky-westminster-files.db]
## $ date <chr> "2020-10-29", "2020-10-29", "2020-10-29", "2021-10-…
## $ donation_id <chr> "C0522788", "C0522787", "C0522863", "C0551159", "C0…
## $ entity <chr> "Aamer A Sarfraz", "Aamer A Sarfraz", "Aamer A Sarf…
## $ nature_of_donation <chr> "", "", "", "", "", "", "", "", "", "", "", "", "",…
## $ party_id <chr> "p4", "p4", "p4", "p1034", "p1034", "p1034", "p4", …
## $ value <dbl> 20000.00, 8000.00, 22000.00, 1000.00, 2000.00, 5000…
glimpse(parties)## Rows: ??
## Columns: 5
## Database: sqlite 3.41.2 [/Users/admin/GiseDropBox Dropbox/Kostis Christodoulou/Teaching/LBS/2022-2023/E628 DSB/dsb2023-github7/data/sky-westminster-files.db]
## $ abbrev <chr> "Alba", "Alliance", "Con", "DUP", "Green", "Ind", "Lab", "L…
## $ background <chr> "0015ff", "C0C0C0", "0000ff", "80", "78b82a", "C0C0C0", "ff…
## $ foreground <chr> "", "FFFFFF", "ffffff", "FFFFFF", "FFFFFF", "FFFFFF", "ffff…
## $ id <chr> "p1034", "p1", "p4", "p7", "p44", "p8", "p15", "p17", "p22"…
## $ name <chr> "Alba Party", "Alliance", "Conservative", "Democratic Union…
You need to work with the payments and
members tables and for now we just want the total among all
years. To insert a new, blank chunk of code where you can write your
beautiful code (and comments!), please use the following shortcut:
Ctrl + Alt + I (Windows) or cmd + option + I
(mac)
# Total value of payments received by each MP across all years, highest first.
payments %>% # This is the payments DB table
group_by(member_id) %>%
# na.rm = TRUE guards against NULL payment values in the database
summarise(total_payments = sum(value, na.rm=TRUE)) %>%
# attach the MP's name, party and constituency from the members table
left_join(members, by = c("member_id"="id")) %>%
arrange(desc(total_payments)) %>%
collect() ## # A tibble: 595 × 8
## member_id total_payments name gender constituency party_id short_name status
## <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 m8 2809765. Ther… F Maidenhead p4 Mrs May active
## 2 m1508 2191387. Sir … M Torridge an… p4 Sir Geoff… active
## 3 m1423 1282402 Bori… M Uxbridge an… p4 Mr Johnson active
## 4 m4514 799936. Keir… M Holborn and… p15 Mr Starmer active
## 5 m1211 769373. Andr… M Sutton Cold… p4 Mr Mitche… active
## 6 m3958 712321. Fion… F Congleton p4 Ms Bruce active
## 7 m14 692438. John… M Wokingham p4 Mr Redwood active
## 8 m4483 546043 Rish… M Richmond (Y… p4 Mr Sunak active
## 9 m4097 538678. Liz … F South West … p4 Ms Truss active
## 10 m188 441681. Ed D… M Kingston an… p17 Mr Davey active
## # ℹ 585 more rows
Is there any entity whose donations account for more
than 5% of the total payments given to MPs over the 2020-2022 interval?
Who are they and who did they give money to?
# Share of all payments contributed by each (entity, MP) pair.
payments %>%
group_by(entity, member_id ) %>% # who gives money
# NOTE(review): sum(value) has no na.rm = TRUE here, unlike the chunk above;
# a NULL value would make that pair's total NA -- confirm this is intended.
summarise(total = sum(value)) %>%
ungroup() %>%
left_join(members, by = c("member_id"="id")) %>%
arrange(desc(total)) %>%
collect() %>%
# percent of the grand total of all payments (computed after collect())
mutate(percent = round(100*total/sum(total), digits=2)) %>%
select(entity, name, party_id, total, percent)## # A tibble: 4,092 × 5
## entity name party_id total percent
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Withers LLP Sir Geoffrey C… p4 1.81e6 5.25
## 2 Fiona Bruce and Co LLP Fiona Bruce p4 7.12e5 2.06
## 3 Charles Stanley John Redwood p4 6.75e5 1.95
## 4 Cambridge Speaker Series Theresa May p4 4.08e5 1.18
## 5 Centerview Partners LLP Boris Johnson p4 2.78e5 0.8
## 6 Council of Insurance Agents & Brokers Boris Johnson p4 2.76e5 0.8
## 7 Hindustan Times Boris Johnson p4 2.62e5 0.76
## 8 Unite Rebecca Long-B… p15 2.49e5 0.72
## 9 Emerging Asset Management Sir Bill Wiggin p4 2.32e5 0.67
## 10 Hutchison Ports Europe Chris Grayling p4 2.24e5 0.65
## # ℹ 4,082 more rows
# https://www.withersworldwide.com/en-gb/people/sir-geoffrey-cox-kc-mpentity donors give to a single party or not?# how many entities have donated
# How many distinct entities appear in the payments table?
payments %>%
count(entity) %>%
collect()## # A tibble: 2,213 × 2
## entity n
## <chr> <int>
## 1 12 Property FE 1
## 2 1912 Club 1
## 3 39th Street Strategies LLC 1
## 4 3V International 1
## 5 5 Oceans Partnership 1
## 6 5x15 1
## 7 79 Borough Road (trading as 'The Ministry') 1
## 8 89UP 2
## 9 8hwe 2
## 10 97 Dining Club 1
## # ℹ 2,203 more rows
# 2213 organisations
# payments from entities to parties - who gives what
# For every (entity, party, member) triple, total payments and the entity's
# share within its (entity, party) group; flag combinations where all of an
# entity's payments within a party went through a single member.
# NOTE(review): after summarise() the data remain grouped by
# (entity, party_name), so `prop` is a share within entity+party, not across
# parties -- confirm that is the intended definition of "single party".
entities_unique <- payments %>%
left_join(members, by = c("member_id" = "id")) %>%
left_join(parties, by = c("party_id" = "id")) %>%
# both joins bring in a `name` column; rename the suffixed duplicates
rename(member_name = name.x,
party_name = name.y) %>%
collect() %>%
group_by(entity, party_name, member_name) %>%
# .groups = "drop_last" is the default behaviour; spelled out to keep it
# explicit and to silence the summarise() message
summarise(total_donations = sum(value, na.rm = TRUE), .groups = "drop_last") %>%
mutate(prop = total_donations / sum(total_donations)) %>%
# a comparison already yields TRUE/FALSE -- no ifelse() needed
mutate(single_party = prop == 1) %>%
arrange(desc(total_donations))
entities_unique## # A tibble: 4,092 × 6
## # Groups: entity, party_name [2,469]
## entity party_name member_name total_donations prop single_party
## <chr> <chr> <chr> <dbl> <dbl> <lgl>
## 1 Withers LLP Conservat… Sir Geoffr… 1812732. 1 TRUE
## 2 Fiona Bruce and Co… Conservat… Fiona Bruce 711749. 1 TRUE
## 3 Charles Stanley Conservat… John Redwo… 674821. 1 TRUE
## 4 Cambridge Speaker … Conservat… Theresa May 408200 1 TRUE
## 5 Centerview Partner… Conservat… Boris John… 277724. 1 TRUE
## 6 Council of Insuran… Conservat… Boris John… 276130 1 TRUE
## 7 Hindustan Times Conservat… Boris John… 261652. 1 TRUE
## 8 Unite Labour Rebecca Lo… 249382 0.398 FALSE
## 9 Emerging Asset Man… Conservat… Sir Bill W… 232038. 1 TRUE
## 10 Hutchison Ports Eu… Conservat… Chris Gray… 223647. 1 TRUE
## # ℹ 4,082 more rows
# Count the (entity, party) combinations flagged as single-party donors.
entities_unique %>%
filter(single_party == TRUE) %>%
count()## # A tibble: 2,037 × 3
## # Groups: entity, party_name [2,037]
## entity party_name n
## <chr> <chr> <int>
## 1 12 Property FE Liberal Democrats 1
## 2 1912 Club Conservative 1
## 3 39th Street Strategies LLC Conservative 1
## 4 3V International Labour 1
## 5 5 Oceans Partnership Conservative 1
## 6 5x15 Labour 1
## 7 79 Borough Road (trading as 'The Ministry') Independent 1
## 8 89UP Labour 1
## 9 8hwe Conservative 1
## 10 97 Dining Club Conservative 1
## # ℹ 2,027 more rows
# 2037 give to a single party
# For multi-party donors, recompute each member's share of the entity's giving.
entities_unique %>%
filter(single_party == FALSE) %>%
# NOTE(review): the data are already one row per (entity, party, member), so
# this group_by/summarise is effectively a rename -- confirm it is needed.
group_by(entity, party_name, member_name) %>%
summarise(total_given = sum(total_donations, na.rm = TRUE)) %>%
mutate(prop = total_given / sum(total_given)) ## # A tibble: 2,055 × 5
## # Groups: entity, party_name [432]
## entity party_name member_name total_given prop
## <chr> <chr> <chr> <dbl> <dbl>
## 1 ADS Group Conservative Jack Lopresti 940 0.662
## 2 ADS Group Conservative Laurence Robertson 480 0.338
## 3 APPG for the Armed Forces Conservative James Gray 480 0.429
## 4 APPG for the Armed Forces Conservative Rob Butler 315 0.282
## 5 APPG for the Armed Forces Conservative Robert Courts 323. 0.289
## 6 APPG for the Polar Regions Conservative James Gray 3459. 0.333
## 7 APPG for the Polar Regions Conservative Nigel Evans 3459. 0.333
## 8 APPG for the Polar Regions Conservative Tim Loughton 3459. 0.333
## 9 Adrian R B Johnson Conservative Scott Mann 6000 0.486
## 10 Adrian R B Johnson Conservative Sir Bernard Jenkin 6347 0.514
## # ℹ 2,045 more rows
I would like you to write code that generates the following table.
# Yearly donation totals per party, plus each party's share of that year's
# overall donations.
total_party_donations <- party_donations %>%
group_by(date, party_id) %>%
summarise(total_donations = sum(value, na.rm = TRUE)) %>%
ungroup() %>%
arrange(desc(total_donations)) %>%
left_join(parties, by = c("party_id"="id")) %>%
collect() %>%
mutate(date = lubridate::ymd(date), # lubridate doesn't work on the DB directly -- need to collect first
year = year(date)) %>%
group_by(year, name) %>%
# .groups = "drop_last" keeps the grouping by year, so `prop` below is the
# within-year share; na.rm added for robustness (totals should not be NA)
summarise(total_year_donations = sum(total_donations, na.rm = TRUE),
.groups = "drop_last") %>%
mutate(prop = total_year_donations / sum(total_year_donations)) %>%
ungroup()
total_party_donations## # A tibble: 28 × 4
## year name total_year_donations prop
## <dbl> <chr> <dbl> <dbl>
## 1 2020 Alliance 105000 0.00150
## 2 2020 Conservative 42770782. 0.612
## 3 2020 Green Party 378068 0.00541
## 4 2020 Labour 13539803. 0.194
## 5 2020 Liberal Democrats 12717295. 0.182
## 6 2020 Plaid Cymru 70000 0.00100
## 7 2020 Scottish National Party 246284. 0.00352
## 8 2020 Sinn Féin 113892 0.00163
## 9 2021 Alba Party 53559. 0.00180
## 10 2021 Alliance 42500 0.00142
## # ℹ 18 more rows
… and then, based on this data, plot the following graph.
# Grouped bar chart of yearly donations; parties are ordered by their total
# donations (largest first) rather than alphabetically.
total_party_donations %>%
# reorder the party factor by summed donations, then reverse so the biggest
# recipient leads the legend/fill order
mutate(name = fct_rev(fct_reorder(name, total_year_donations, sum))) %>%
ggplot()+
aes(x=factor(year),
y = total_year_donations,
fill = name,
group = name)+
geom_col(position = "dodge")+
theme_light()+
scale_y_continuous(labels = scales::comma)+
labs(
fill = "Party",
title = "Conservatives have captured the majority of political donations",
subtitle = "Donations to UK political parties, 2020-2022",
x = NULL,
y = NULL
) +
theme(plot.title.position = "plot") # ensure title is top-left aligned
This uses the default ggplot colour palette, as I don't want you to worry about using the official colours for each party. However, I would like you to ensure the parties are sorted according to total donations and not alphabetically. You may even want to remove some of the smaller parties that hardly register on the graph. Would faceting help you?
Finally, when you are done working with the database, make sure you close the connection, or disconnect from the database.
dbDisconnect(sky_westminster)We will be using a dataset with anonymous
Covid-19 patient data that the CDC publishes every month. The file
we will use was released on April 11, 2023, and has data on 98 million
patients, with 19 features. This file cannot be loaded in memory, but
luckily we have the data in parquet format and we will use
the {arrow} package.
The dataset cdc-covid-geography is in
parquet format that {arrow} can handle. It is > 600Mb and
too large to be hosted on Canvas or Github, so please download it from
dropbox https://www.dropbox.com/sh/q1yk8mmnbbrzavl/AAAxzRtIhag9Nc_hODafGV2ka?dl=0
and save it in your dsb repo, under the data
folder
## 0.045 sec elapsed
## FileSystemDataset with 1 Parquet file
## 97,799,772 rows x 19 columns
## $ case_month <string> "2021-09", "2022-09", "2022-01", "2020…
## $ res_state <string> "TX", "TX", "TX", "CA", "IL", "CA", "N…
## $ state_fips_code <int32> 48, 48, 48, 6, 17, 6, 36, 36, 36, 53, …
## $ res_county <string> "TARRANT", NA, "HARRIS", "SAN BERNARDI…
## $ county_fips_code <int32> 48439, NA, 48201, 6071, 17031, 6085, 3…
## $ age_group <string> "18 to 49 years", "18 to 49 years", "1…
## $ sex <string> "Male", "Male", "Female", "Female", "F…
## $ race <string> "White", "White", "Unknown", "Asian", …
## $ ethnicity <string> "Non-Hispanic/Latino", "Non-Hispanic/L…
## $ case_positive_specimen_interval <int32> NA, NA, NA, NA, 0, NA, 0, 0, 0, 0, 0, …
## $ case_onset_interval <int32> NA, NA, -1, NA, 0, NA, NA, NA, NA, 0, …
## $ process <string> "Missing", "Missing", "Missing", "Miss…
## $ exposure_yn <string> "Missing", "Missing", "Missing", "Miss…
## $ current_status <string> "Laboratory-confirmed case", "Probable…
## $ symptom_status <string> "Missing", "Missing", "Symptomatic", "…
## $ hosp_yn <string> "Missing", "Missing", "No", "No", "No"…
## $ icu_yn <string> "Missing", "Missing", "Missing", "Miss…
## $ death_yn <string> "Missing", "Missing", "Missing", "Miss…
## $ underlying_conditions_yn <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
Can you query the database and replicate the following plot?
# Covid CFR % by age group, sex and ICU entry yes/no
# Count CDC cases by sex x age group x death x ICU status. The aggregation
# runs inside arrow; collect() pulls only the small summary into memory.
tic()
query2 <- cdc_data %>%
# dplyr commands like
# select, filter, group_by, summarise...
select(sex, age_group, death_yn, icu_yn) %>%
filter(sex %in% c("Male","Female"),
!is.na(age_group),
!(age_group=="Unknown"),
!(age_group=="Missing"),
death_yn %in% c("Yes","No"),
icu_yn %in% c("Yes","No")) %>% # select and clean the data
group_by(sex, age_group, death_yn, icu_yn) %>%
# use %>% consistently (the original mixed in a native |> at this step)
summarise(count = n()) %>%
# execute query and retrieve results in a tibble (dataframe).
collect()
toc()## 3.845 sec elapsed
# Reshape counts to one row per group with `yes`/`no` columns, then compute
# the case fatality rate.
mydata2 <- query2 %>%
pivot_wider(names_from = death_yn,
values_from = count) %>%
clean_names() %>%
# NOTE(review): if a group has only deaths or only survivors, pivot_wider
# leaves an NA count and death_rate becomes NA -- confirm this is acceptable.
mutate(death_rate = yes/(no+yes)) # calculate death rate
# Bar chart of CFR by age group, faceted by ICU admission (rows) and sex (cols).
mydata2 %>%
mutate(icu_yn = factor(icu_yn,
levels = c("Yes","No"),
labels = c("ICU Admission", "No ICU Admission"))) %>% # Turn `icu_yn` from character to a factor variable with levels
ggplot(mapping=aes(x=death_rate, y=age_group)) + # draw a plot
geom_col(fill="#ff8f7c") +
facet_grid(rows = vars(icu_yn),
cols = vars(sex),
scales = "free_y")+
theme_light(
base_size = 12,
base_family = "",
base_line_size = 0.5,
base_rect_size = 0.5)+
labs(y=NULL,
x=NULL,
title = "Covid CFR % by age group, sex and ICU Admission",
caption = "Source: CDC")+
# print the CFR as a whole-number percentage at the end of each bar
geom_text(aes(label = round(100*death_rate,0)),
vjust=0.5,
hjust=0.99,
colour = "black",
position = position_dodge(.9),
size = 4) +
scale_x_continuous(labels=scales::percent) +
theme(text=element_text(size=12, family="Montserrat"))+
# ensure title is top-left aligned
theme(plot.title.position = "plot")+
NULLThe previous plot is an aggregate plot for all three years of data. What if we wanted to plot Case Fatality Ratio (CFR) over time? Write code that collects the relevant data from the database and plots the following
# Same aggregation as query2, with case_month added so CFR can be tracked
# over time.
tic()
query3 <- cdc_data %>%
# dplyr commands like
# select, filter, group_by, summarise...
select(sex, age_group, death_yn, icu_yn, case_month) %>%
filter(sex %in% c("Male","Female"),
!is.na(age_group),
!(age_group=="Unknown"),
!(age_group=="Missing"),
death_yn %in% c("Yes","No"),
icu_yn %in% c("Yes","No")) %>% # select and clean the data
group_by(sex, age_group, death_yn, icu_yn, case_month) %>%
# use %>% consistently (the original mixed in a native |> at this step)
summarise(count = n()) %>%
# execute query and retrieve results in a tibble (dataframe).
collect()
toc()## 4.493 sec elapsed
# Same reshape as mydata2, now keeping case_month for the time series.
mydata3 <- query3 %>%
pivot_wider(names_from = death_yn,
values_from = count) %>%
clean_names() %>%
mutate(death_rate = yes/(no+yes))
# CFR over time (one line per age group), faceted by ICU admission and sex.
mydata3 %>%
filter(age_group != "0 - 17 years") %>%
mutate(
icu_yn = factor(icu_yn,
levels = c("Yes","No"),
labels = c("ICU Admission", "No ICU Admission"))) %>% # Turn `icu_yn` from character to a factor variable with levels
ggplot(mapping=aes(x=case_month, y=death_rate, colour=age_group, group=age_group)) + # draw a plot
geom_line() +
facet_grid(rows = vars(icu_yn),
cols = vars(sex),
scales = "free_y")+
theme_light(
base_size = 8,
base_family = "",
base_line_size = 0.5,
base_rect_size = 0.5)+
labs(y=NULL,
x=NULL,
colour = "Age Group",
title = "Covid CFR % by age group, sex and ICU Admission",
caption = "Source: CDC")+
geom_text(aes(label = round(100*death_rate,0)), vjust=0.5, hjust=0, position = position_dodge(.9), size = 3) +
scale_y_continuous(labels=scales::percent) +
theme(text=element_text(size=12, family="Montserrat"))+
# ensure title is top-left aligned
theme(plot.title.position = "plot")+
theme(axis.text.x=element_text(angle=90,hjust=1)) +
theme(axis.text.x = element_text(size = 7))+
# scale_x_discrete(breaks = case_month[c(T,F,F)])+
theme(
# axis.line = element_line(color='black'),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
# panel.border = element_blank()
)+
NULL## Warning: Removed 24 rows containing missing values (`geom_line()`).
## Warning: Removed 141 rows containing missing values (`geom_text()`).
For each patient, the dataframe also lists the patient’s states and county FIPS code. The CDC also has information on the NCHS Urban-Rural classification scheme for counties
urban_rural <- read_xlsx(here::here("data", "NCHSURCodes2013.xlsx")) %>%
janitor::clean_names() Each county belongs in six different categories, with categories 1-4
being urban areas and categories 5-6 being rural, according to the
following criteria captured in x2013_code
Category name
Can you query the database, extract the relevant information, and reproduce the following two graphs that look at the Case Fatality ratio (CFR) in different counties, according to their population?
# query 4 - CFR by type of county: same aggregation again, now also keyed by
# county FIPS code so results can be joined to the NCHS classification.
tic()
query4 <- cdc_data %>%
select(sex, age_group, death_yn, icu_yn, case_month, county_fips_code) %>%
filter(sex %in% c("Male","Female"),
!is.na(age_group),
!(age_group=="Unknown"),
!(age_group=="Missing"),
death_yn %in% c("Yes","No"),
icu_yn %in% c("Yes","No")) %>% # select and clean the data
group_by(sex, age_group, death_yn, icu_yn, case_month, county_fips_code) %>%
# use %>% consistently (the original mixed in a native |> at this step)
summarise(count = n()) %>%
# execute query and retrieve results in a tibble (dataframe).
collect()
toc()## 4.823 sec elapsed
# Reshape county-level counts. Groups with no recorded survivors are dropped;
# a missing death count is treated as zero deaths.
query4_wide <- query4 %>%
pivot_wider(names_from = death_yn,
values_from = count) %>%
janitor::clean_names() %>%
drop_na(no) %>%
mutate(
# NA in `yes` means no deaths were recorded for the group
yes = ifelse(is.na(yes),0,yes),
death_rate = yes/(no+yes))
# Attach the NCHS urban/rural classification and collapse it to two buckets:
# codes 1-4 = urban, 5-6 = rural.
plot_data <-
left_join(query4_wide, urban_rural, by=c("county_fips_code" = "fips_code"))%>%
drop_na(county_fips_code) %>%
mutate(
urban14_rural56 = case_when(
x2013_code == 5 | x2013_code == 6 ~ "Rural",
x2013_code %in% 1:4 ~ "Urban",
# counties that did not match the NCHS lookup stay NA instead of being
# silently labelled "Urban" (the original `TRUE ~ "Urban"` branch did that)
TRUE ~ NA_character_
)
)
# zones 1-6 split
# 1. Large central metro - 1 million or more population and contains the entire population of the largest principal city
# 2. large fringe metro - 1 million or more poulation, but does not qualify as 1
# 3. Medium metro - 250K - 1 million population
# 4. Small metropolitan population < 250K
# 5. Micropolitan
# 6. Noncore
library(ggrepel)
# CFR over time for each of the six NCHS county classes, one facet per class.
plot_data %>%
drop_na(x2013_code) %>%
group_by(x2013_code, case_month) %>%
# .groups = "drop_last" is the default; made explicit to silence the message
summarise(totalyes = sum(yes),
totalno = sum(no),
death_rate = totalyes/(totalyes +totalno),
.groups = "drop_last") %>%
mutate(
category6 = case_when(
x2013_code == 1 ~ "1. Large central metro",
x2013_code == 2 ~ "2. Large fringe metro",
x2013_code == 3 ~ "3. Medium metro",
x2013_code == 4 ~ "4. Small metropolitan",
x2013_code == 5 ~ "5. Micropolitan",
x2013_code == 6 ~ "6. Noncore")) %>%
ggplot(aes(x=case_month, y = death_rate, colour=category6, group = category6))+
geom_line()+
theme_light()+
labs(y=NULL,
x=NULL,
# fixed typo: "country" -> "county"
title = "Covid CFR % by county population",
caption = "Source: CDC")+
geom_text_repel(aes(label = round(100*death_rate,1)), vjust=0.5, hjust=0, position = position_dodge(.9), size = 3) +
scale_y_continuous(labels=scales::percent) +
# ensure title is top-left aligned (stated once; the original repeated it)
theme(plot.title.position = "plot")+
facet_wrap(~category6, scales = "free", ncol=2)+
theme(legend.position = "none")+
theme(text=element_text(size=12, family="Montserrat"))+
theme(axis.text.x=element_text(angle=90,hjust=1)) +
theme(axis.text.x = element_text(size = 7))+
theme(
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)+
NULL## `summarise()` has grouped output by 'x2013_code'. You can override using the
## `.groups` argument.
# all 6 regions
# CFR over time, urban vs rural counties (two-level split from plot_data).
plot_data %>%
group_by(urban14_rural56, case_month) %>%
summarise(totalyes = sum(yes),
totalno = sum(no),
death_rate = totalyes/(totalyes +totalno)) %>%
ggplot(aes(x=case_month, y = death_rate, colour=urban14_rural56, group = urban14_rural56))+
geom_line()+
theme_light()+
labs(y=NULL,
x=NULL,
title = "Covid CFR % by rural and urban areas",
caption = "Source: CDC",
colour = "Counties")+
geom_text(aes(label = round(100*death_rate,1)), vjust=0.5, hjust=0, colour = "black", position = position_dodge(.9), size = 3) +
scale_y_continuous(labels=scales::percent) +
theme(text=element_text(size=12, family="Montserrat"))+
# ensure title is top-left aligned
theme(plot.title.position = "plot")+
theme(axis.text.x=element_text(angle=90,hjust=1)) +
theme(axis.text.x = element_text(size = 7))+
theme(
# axis.line = element_line(color='black'),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
)+
NULL## `summarise()` has grouped output by 'urban14_rural56'. You can override using
## the `.groups` argument.
We will scrape and work with data on foreign-connected PACs that donate to US political campaigns. The data for foreign-connected PAC contributions in the 2022 election cycle can be found at https://www.opensecrets.org/political-action-committees-pacs/foreign-connected-pacs/2022. Then, we will use a similar approach to get data on such contributions from previous years so that we can examine trends over time.
All data come from OpenSecrets.org, a “website tracking the influence of money on U.S. politics, and how that money affects policy and citizens’ lives”.
library(robotstxt)
paths_allowed("https://www.opensecrets.org")## [1] TRUE
snake_case naming.base_url <- "https://www.opensecrets.org/political-action-committees-pacs/foreign-connected-pacs/"
# Scrape the 2022 foreign-connected PAC table from OpenSecrets.
year <- "2022"
url <- str_c(base_url, year)
tables <- url %>%
# get tables that exist on url
read_html() %>%
html_nodes(css="table") %>% # this will isolate all tables on page
html_table() # Parse an html table into a dataframe
tables## [[1]]
## # A tibble: 215 × 5
## `PAC Name (Affiliate)` Country of Origin/Pa…¹ Total Dems Repubs
## <chr> <chr> <chr> <chr> <chr>
## 1 Accenture (Accenture) Ireland/Accenture plc $3,0… $0 $3,000
## 2 Acreage Holdings Canada/Acreage Holdin… $0 $0 $0
## 3 Air Liquide America France/L'Air Liquide … $17,… $14,… $2,500
## 4 Airbus Group Netherlands/Airbus Gr… $193… $82,… $111,…
## 5 Alexion Pharmaceuticals (AstraZene… UK/AstraZeneca PLC $186… $104… $82,2…
## 6 Alkermes Inc Ireland/Alkermes Plc $84,… $34,… $50,0…
## 7 Allianz of America (Allianz) Germany/Allianz AG Ho… $31,… $20,… $11,0…
## 8 AMG Vanadium Netherlands/AMG Advan… $2,5… $0 $2,525
## 9 Anheuser-Busch (Anheuser-Busch InB… Belgium/Anheuser-Busc… $457… $218… $239,…
## 10 AON Corp (AON plc) UK/AON PLC $98,… $52,… $46,5…
## # ℹ 205 more rows
## # ℹ abbreviated name: ¹`Country of Origin/Parent Company`
## Use `tables[[1]]` to get first table
# Keep only the first table (the PAC contributions) and tidy the column names.
contributions <- tables[[1]] %>%
janitor::clean_names() %>%
#add a new column with the year
mutate(year=year)
# we just want to grab tablesClean the data:
total, dems, and repubs from
character strings to numeric values.country_of_origin_parent_company into two
such that country and parent company appear in different columns for
country-level analysis.# write a function to parse_currency
# Convert a currency string such as "$3,000" into a number.
#
# x: character vector of dollar amounts ("$1,234", "$0", ...)
# Returns a numeric vector; unparseable inputs become NA (with a warning
# from as.numeric()). Vectorised over x.
#
# Implemented with base sub()/gsub() instead of stringr -- the two
# substitutions are trivial and this removes an unnecessary dependency.
parse_currency <- function(x){
  # drop the (first) dollar sign; fixed = TRUE matches "$" literally rather
  # than as the end-of-string regex anchor
  no_dollar <- sub("$", "", x, fixed = TRUE)
  # remove every thousands separator
  no_comma <- gsub(",", "", no_dollar, fixed = TRUE)
  # convert to numeric
  as.numeric(no_comma)
}
# clean country/parent co and contributions
# Split "Country/Parent" into two columns and convert the dollar-amount
# strings into numeric values.
contributions <- contributions %>%
separate(country_of_origin_parent_company,
into = c("country", "parent"),
sep = "/",
# extra = "merge" keeps any additional "/" inside the parent-company name
extra = "merge") %>%
mutate(
total = parse_currency(total),
dems = parse_currency(dems),
repubs = parse_currency(repubs)
)Write a function called scrape_pac() that scrapes
information from the Open Secrets webpage for foreign-connected PAC
contributions in a given year. This function should
year. We will
want this information when we ultimately have data from all years, so
this is a good time to keep track of it. Our function doesn’t take a
year argument, but the year is embedded in the URL, so we can extract it
out of there, and add it as a new column. Use the str_sub()
function to extract the last 4 characters from the URL. You will
probably want to look at the help for this function to figure out how to
specify “last 4 characters”.Define the URLs for 2022, 2020, and 2000 contributions. Then, test your function using these URLs as inputs. Does the function seem to do what you expected it to do?
Construct a vector called urls that contains the
URLs for each webpage that contains information on foreign-connected PAC
contributions for a given year.
Map the scrape_pac() function over urls
in a way that will result in a data frame called
contributions_all.
Write the data frame to a csv file called
contributions-all.csv in the data
folder.
# Convert a currency string such as "$3,000" into a number (base-R version;
# no stringr needed for these two trivial substitutions).
#
# x: character vector of dollar amounts. Returns numeric; unparseable
# inputs become NA with a warning from as.numeric(). Vectorised.
parse_currency <- function(x){
  # remove the leading dollar sign (fixed = TRUE -> literal "$", not the
  # end-of-string regex anchor), then strip all comma separators
  stripped <- gsub(",", "", sub("$", "", x, fixed = TRUE), fixed = TRUE)
  # convert to numeric
  as.numeric(stripped)
}
# clean country/parent co and contributions
# Scrape one year of foreign-connected PAC contributions from OpenSecrets.
#
# year: election-cycle year (e.g. 2022); appended to the base URL and
#   recorded in a `year` column of the result.
# Returns a tibble with pac_name_affiliate, country, parent, total, dems,
# repubs and year.
scrape_pac <- function(year) {
  base_url <- "https://www.opensecrets.org/political-action-committees-pacs/foreign-connected-pacs/"
  url <- str_c(base_url, year)
  tables <- url %>%
    # get tables that exist on url
    read_html() %>%
    html_nodes(css="table") %>% # this will isolate all tables on page
    html_table() # Parse an html table into a dataframe
  # Use `tables[[1]]` to get first table
  contributions <- tables[[1]] %>%
    janitor::clean_names() %>%
    # add a new column with the year
    mutate(year=year) %>%
    # split "Country/Parent"; extra = "merge" keeps any additional "/"
    # characters inside the parent-company name
    separate(country_of_origin_parent_company,
             into = c("country", "parent"),
             sep = "/",
             extra = "merge") %>%
    mutate(
      total = parse_currency(total),
      dems = parse_currency(dems),
      repubs = parse_currency(repubs)
    )
  return(contributions)
}

# NOTE(review): in the original these two statements were fused onto the
# function's closing brace (`}years <- ...`), which is a syntax error.
years <- seq(from=2000, to=2022, by=2)
# map over every election cycle and row-bind the results into one dataframe
contributions_all <- map_df(years, scrape_pac)
glimpse(contributions_all)## Rows: 2,412
## Columns: 7
## $ pac_name_affiliate <chr> "7-Eleven", "ABB Group", "Accenture", "ACE INA", "A…
## $ country <chr> "Japan", "Switzerland", "UK", "UK", "Germany", "Ger…
## $ parent <chr> "Ito-Yokado", "Asea Brown Boveri", "Accenture plc",…
## $ total <dbl> 8500, 46000, 75984, 38500, 2000, 10500, 24000, 5825…
## $ dems <dbl> 1500, 17000, 23000, 12500, 2000, 10000, 10000, 1050…
## $ repubs <dbl> 7000, 28500, 52984, 26000, 0, 500, 14000, 47750, 15…
## $ year <dbl> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 200…
# Top PAC donors by total contributions, with each PAC's share of its year.
contributions_all %>%
group_by(year, pac_name_affiliate) %>%
summarise(total_donations = sum(total, na.rm=TRUE)) %>%
# perc is computed within the `year` groups that summarise() keeps
mutate(perc = round(100*total_donations / sum(total_donations), digits=2)) %>%
arrange(desc(total_donations))## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## # A tibble: 2,378 × 4
## # Groups: year [12]
## year pac_name_affiliate total_donations perc
## <dbl> <chr> <dbl> <dbl>
## 1 2020 UBS Americas 1615000 7.17
## 2 2008 Anheuser-Busch 1505897 8.42
## 3 2014 UBS Americas 1481250 6.9
## 4 2022 UBS Americas 1470000 7.38
## 5 2016 UBS Americas 1445490 6.06
## 6 2018 UBS Americas 1389750 5.76
## 7 2020 Toyota Motor North America 1161642 5.16
## 8 2022 Toyota Motor North America 1053977 5.29
## 9 2006 GlaxoSmithKline 891413 7.1
## 10 2018 Toyota Motor North America 874999 3.63
## # ℹ 2,368 more rows
The website https://www.consultancy.uk/jobs/ lists job openings for consulting jobs.
library(robotstxt)
paths_allowed("https://www.consultancy.uk") #is it ok to scrape?##
www.consultancy.uk
## [1] TRUE
base_url <- "https://www.consultancy.uk/jobs/page/1"
listings_html <- base_url %>%
read_html()Identify the CSS selectors in order to extract the relevant information from this page, namely
Can you get all pages of ads, and not just the first one,
https://www.consultancy.uk/jobs/page/1 into a
dataframe?
Write a function called scrape_jobs() that scrapes
information from the webpage for consulting positions. This function
should
have one input: the URL of the webpage and should return a data frame with four columns (variables): job, firm, functional area, and type
Test your function works with other pages too, e.g., https://www.consultancy.uk/jobs/page/2. Does the function seem to do what you expected it to do?
Given that you have to scrape ...jobs/page/1,
...jobs/page/2, etc., define your URL so you can join
multiple strings into one string, using str_c(). For
instance, if page is 5, what do you expect the following
code to produce?
base_url <- "https://www.consultancy.uk/jobs/page/1"
url <- str_c(base_url, page)
Construct a vector called pages that contains the
numbers for each page available
Map the scrape_jobs() function over
pages in a way that will result in a data frame called
all_consulting_jobs.
Write the data frame to a csv file called
all_consulting_jobs.csv in the data
folder.
# Scrape one page of consultancy.uk job listings.
#
# page: page number appended to .../jobs/page/
# Returns a tibble with columns job, firm, functional_area, type and link.
get_listings <- function(page) {
  url <- str_c("https://www.consultancy.uk/jobs/page/", page)
  page_html <- read_html(url)

  # helper: text of every element matching a CSS selector on this page
  grab_text <- function(css) {
    html_text2(html_nodes(page_html, css = css))
  }

  # the listing rows carry relative hrefs; prefix the site root below
  raw_links <- html_attr(
    html_nodes(page_html, css = ".hide-phone .row-link"),
    "href"
  )

  tibble(
    job = grab_text("span.title"),
    firm = grab_text(".hide-phone .row-link"),
    functional_area = grab_text(".initial"),
    type = grab_text(".hide-tablet-landscape .row-link"),
    link = str_c("https://www.consultancy.uk", raw_links)
  )
}
pages <- 1:8 # apply to the first 8 pages; if more, change to 1:X
# scrape every page and row-bind the per-page tibbles into one dataframe
jobs <- map_df(pages, get_listings)
glimpse(jobs)## Rows: 338
## Columns: 5
## $ job <chr> "Intermediate Quantity Surveyor", "Consultant Roles (a…
## $ firm <chr> "Panoptic Consultancy Group", "Mason Advisory", "The U…
## $ functional_area <chr> "Project Management", "Digital", "Strategy", "Sales", …
## $ type <chr> "Job", "Job", "Job", "Job", "Job", "Job", "Job", "Job"…
## $ link <chr> "https://www.consultancy.uk/jobs/33696/panoptic-consul…
# Which firms have the most openings?
jobs %>%
count(firm, sort=TRUE)## # A tibble: 38 × 2
## firm n
## <chr> <int>
## 1 PA Consulting 98
## 2 FTI Consulting 76
## 3 PwC 33
## 4 Capgemini Invent 26
## 5 B2E Consulting 9
## 6 Valcon 8
## 7 Yonder Consulting 8
## 8 Genioo 6
## 9 ThreeTwoFour 6
## 10 Ayming 5
## # ℹ 28 more rows
# Distribution of openings across functional areas (excluding Unknown).
jobs %>%
count(functional_area, sort=TRUE) %>%
filter(functional_area != "Unknown") %>%
mutate(perc = n/sum(n))## # A tibble: 31 × 3
## functional_area n perc
## <chr> <int> <dbl>
## 1 Strategy 31 0.124
## 2 Management 25 0.100
## 3 Data Science 24 0.0964
## 4 Finance 15 0.0602
## 5 IT Strategy 15 0.0602
## 6 Software 15 0.0602
## 7 Digital 14 0.0562
## 8 Project Management 12 0.0482
## 9 Cyber Security 10 0.0402
## 10 Human Resources 9 0.0361
## # ℹ 21 more rows